Final Project Rmarkdown File

Group Members: Adam Gonzalez, Jon Le (4pm section), Erin Lee, Debbie Lu & Corinne Smith

1) CLEAR ENVIRONMENT

# code to remove objects in Environment before knitting
# NOTE(review): rm(list = ls()) only clears the global environment, not
# loaded packages or options, and is generally discouraged inside scripts;
# knitting already runs in a fresh session, which achieves the same thing.
rm(list = ls())

2) LOAD THE DATA

Data sets: “books” is from https://www.kaggle.com/jealousleopard/goodreadsbooks “books_two” is from https://www.kaggle.com/choobani/goodread-authors?select=final_dataset.csv

# Load the two Kaggle data sets. here::here() resolves paths relative to
# the project root, so this works regardless of the working directory.
books <- read.csv(here::here("datasets", "books.csv"))
books_two <- read.csv(here::here("datasets", "final_dataset.csv"))

3) LOAD LIBRARIES

# ---------------------------------------
library('yardstick')
# ---------------------------------------

# ---------------------------------------
# data visualization
# --------------------------------------
library('ggplot2')
library('plotly')
library('gganimate')
library('ggridges')

# ---------------------------------------
# data manipulation
# --------------------------------------
library('forcats')
library('tidyverse')
library('magrittr')
library('lubridate')
library('dplyr')
library('DT')
#install.packages("formattable")
#install.packages("tidyr")
library('formattable')
library('tidyr')
library('data.table')
library('kableExtra')

# ---------------------------------------
# sentiment analysis
# --------------------------------------
library('sentimentr')

# ---------------------------------------
# summary statistics
# --------------------------------------
#install.packages("qwraps2")
library("qwraps2")

# ---------------------------------------
# model validation library
# ---------------------------------------
library('rsample')

# ---------------------------------------
# generalized linear model libraries
# ---------------------------------------
library('glmnet')
library('glmnetUtils')

# ---------------------------------------
# regression output
# ---------------------------------------
# install.packages('sjPlot')
library('sjPlot')
# install.packages('tidymodels')
library('tidymodels')

# ---------------------------------------
# random forest libraries
# ---------------------------------------
library('partykit')
#library('tidyverse')
library('PerformanceAnalytics')
library('rpart')      
library('rpart.plot')  
library('randomForest')
#install.packages("randomForestExplainer")
library('randomForestExplainer')

# ---------------------------------------
# lasso libraries
# ---------------------------------------
library('broom')
library('coefplot')
# ---------------------------------------
# Standardize column names across the two data sets so they can be merged:
# both tables end up with an `author` key, and the rating/count columns
# get unambiguous book_/author_ prefixes.
books <- books %>% rename(avg_book_rating = average_rating,
                          book_ratings_count = ratings_count,
                          author = authors)
# FIX: removed the stray trailing comma after the last rename pair -- it
# only parsed because rlang tolerates trailing commas in `...`.
books_two <- books_two %>% rename(author = name,
                                  authorworkcount = workcount,
                                  author_fans = fan_count,
                                  avg_author_rating = average_rate,
                                  author_ratings_count = rating_count,
                                  author_review_count = review_count)

4) SENTIMENT ANALYSIS

Need ‘sentimentr’ library.

# Score the sentiment of each book title: get_sentences() tokenizes the
# titles, sentiment_by() averages sentence-level sentiment per group.
# NOTE(review): books$title is passed as the `by` grouping, so results are
# per-title averages -- confirm this is intended rather than one overall
# average.
sentiment_DF <- get_sentences(books$title) %>% sentiment_by(books$title)
head(sentiment_DF)
##                                                                                                                            title
## 1:                                                                                                                              
## 2:                                                                                                 said the shotgun to the head.
## 3: $30 Film School: How to Write  Direct  Produce  Shoot  Edit  Distribute  Tour With  and Sell Your Own No-Budget Digital Movie
## 4:                                                                                                                  'Salem's Lot
## 5:                                                                                            1 000 Places to See Before You Die
## 6:                                                                                                                 10 lb Penalty
##    word_count sd ave_sentiment
## 1:          0  0     0.0000000
## 2:          6 NA    -0.1632993
## 3:         20 NA    -0.1118034
## 4:         16  0     0.0000000
## 5:          6 NA    -0.3061862
## 6:          2 NA    -0.5303301

5) MERGING

6) DATA CLEANING

# Correct the column data types: counts/ratings to numeric, the three
# date columns parsed from m/d/Y strings, gender to a factor.
books_1 <- books_sa %>%
  mutate(
    across(c(num_pages, avg_book_rating, text_reviews_count), as.numeric),
    across(c(publication_date, born, died),
           ~ as.Date(.x, format = "%m/%d/%Y")),
    gender = as.factor(gender)
  )


# Remove NAs, duplicate titles, and implausible rows.  filter() keeps only
# rows where every condition is TRUE (NA comparisons are dropped), so the
# page-count range checks behave exactly like the originals.
books_total <- books_1 %>%
  filter(
    !is.na(avg_book_rating), !is.na(book_ratings_count),
    !is.na(text_reviews_count), !is.na(publication_date),
    !duplicated(title),
    avg_book_rating != 0,
    author != "NOT A BOOK",
    # FIX: native comparison operators replace the magrittr aliases
    # is_greater_than()/is_less_than() -- same result, standard idiom.
    # (num_pages >= 10 also subsumes the old num_pages != 0 check.)
    num_pages <= 2000,
    num_pages >= 10,
    bookID != 9796
  )


# remove irrelevant variables (12):
# isbn13, sd(standard deviation of words in title), author ID, image_URL, about, influence, website, twitter, original hometown, country, latitude, longitude

# Drop the columns we will not model on, then give the sentiment columns
# clearer names.
books_corti <- books_total %>%
  select(-c(isbn13, sd, authorid, image_url, about, influence,
            website, twitter, original_hometown, country,
            latitude, longitude)) %>%
  rename(title_sentiment_avg = ave_sentiment,
         title_word_count = word_count)
# View(books_corti)

7) DATA EXPLORATION

# NA VISUALIZATION
# to see the number of missing values in each column

# STEPS:
# 1) We need to sum through every column using a FOR loop.
# 2) Then print the variable name using names(books_corti)[i].
# 3) Finally, we print the sum of is.na() for just that variable.

# FOR loop to see each column in books data set
# Print "Variable: <name> NAs: <count>" for every column.
# FIX: seq_len(ncol(...)) instead of 1:ncol(...) (safe for zero columns).
for(i in seq_len(ncol(books_corti))){

  print(

    paste0("Variable: ",

           # the column name, then its missing-value count
           names(books_corti)[i], " NAs: ",

           # FIX: extract the column directly with [[i]] -- selecting by a
           # bare integer position inside select() is deprecated dplyr
           # behavior; the printed output is identical.
           sum(is.na(books_corti[[i]]))
          )

        )
}
## [1] "Variable: bookID NAs: 0"
## [1] "Variable: title NAs: 0"
## [1] "Variable: author NAs: 0"
## [1] "Variable: avg_book_rating NAs: 0"
## [1] "Variable: isbn NAs: 0"
## [1] "Variable: language_code NAs: 0"
## [1] "Variable: num_pages NAs: 0"
## [1] "Variable: book_ratings_count NAs: 0"
## [1] "Variable: text_reviews_count NAs: 0"
## [1] "Variable: publication_date NAs: 0"
## [1] "Variable: publisher NAs: 0"
## [1] "Variable: title_word_count NAs: 0"
## [1] "Variable: title_sentiment_avg NAs: 0"
## [1] "Variable: authorworkcount NAs: 0"
## [1] "Variable: author_fans NAs: 0"
## [1] "Variable: gender NAs: 0"
## [1] "Variable: born NAs: 5914"
## [1] "Variable: died NAs: 5914"
## [1] "Variable: avg_author_rating NAs: 0"
## [1] "Variable: author_ratings_count NAs: 0"
## [1] "Variable: author_review_count NAs: 0"
## [1] "Variable: genre NAs: 0"
# starts_with() selects every column whose name begins with "isbn"

books_corti %>% select(starts_with("isbn")) %>% glimpse()
## Rows: 5,914
## Columns: 1
## $ isbn <chr> "0439554896", "0517226952", "0345453743", "1400052920", "0517149…
# Top ten highest-rated books: keep the three display columns, sort by
# rating (descending), take the first ten rows.

explore_data <- books_corti %>%
  select(title, author, avg_book_rating) %>%
  arrange(desc(avg_book_rating)) %>%
  slice(1:10)
print(explore_data)
##                                                         title
## 1  Zone of the Enders: The 2nd Runner Official Strategy Guide
## 2     The Diamond Color Meditation: Color Pathway to the Soul
## 3                                   Taxation of Mineral Rents
## 4               The Irish Anatomist: A Study of Flann O'Brien
## 5   His Princess Devotional: A Royal Encounter With Your King
## 6                                     Stargirl LitPlans on CD
## 7                              The Complete Calvin and Hobbes
## 8        Wissenschaft der Logik: Die Lehre Vom Begriff (1816)
## 9               It's a Magical World (Calvin and Hobbes  #11)
## 10         Homicidal Psycho Jungle Cat (Calvin and Hobbes #9)
##                           author avg_book_rating
## 1                     Tim Bogenn            5.00
## 2                  John  Diamond            5.00
## 3                   Ross Garnaut            5.00
## 4                  Keith Donohue            5.00
## 5            Sheri Rose Shepherd            5.00
## 6                Mary B. Collins            4.86
## 7                 Bill Watterson            4.82
## 8  Georg Wilhelm Friedrich Hegel            4.78
## 9                 Bill Watterson            4.76
## 10                Bill Watterson            4.72
datatable(books_corti)

Expectation:

Linear regression analysis is sensitive to outliers. Use histogram to see where this will occur.

# Histogram of average book rating -- linear regression is sensitive to
# outliers, so eyeball the tails here before modeling.
books_corti %>%
  ggplot(aes(x = avg_book_rating)) +
  geom_histogram(fill = "skyblue", color = "#879bcd") +
  xlab("Average Book Rating") +
  ylab("Count") +
  ggtitle("               Histogram to View Outliers") +
  theme_dark(base_size = 18)

# Scatter of title sentiment against average rating, rendered as an
# interactive plotly widget.
p <- ggplot(books_corti, aes(avg_book_rating, title_sentiment_avg)) +
  geom_point(color = "skyblue", alpha = 1/2, size = 0.5) +
  xlab("Average Book Rating") +
  ylab("Title Sentiment") +
  ggtitle("Exploring the Data: Visualization 1") +
  theme_bw(base_size = 18)

ggplotly(p)
# Ridgeline densities of average book rating by author gender.
# NOTE(review): fct_lump(gender, n = 10) is a no-op here -- gender has only
# 3 levels (female/male/unknown per the str() output below), so nothing is
# lumped; harmless, but could be dropped.
p <- ggplot(books_corti %>%
         mutate(genderMutated = fct_lump(gender, n = 10)),
       aes(x = avg_book_rating, y = genderMutated, fill = genderMutated)) +
  theme_minimal(base_size = 18) +
  geom_density_ridges(color="black") +
  xlab("Average Book Rating") +
  ylab("Gender of Author") +  
  ggtitle("   Exploring the Data: Visualization 2")

# legend suppressed: the y axis already labels each gender
p + theme(legend.position = "none")

8) EXAMINING DATA STRUCTURE

str(books_corti)
## 'data.frame':    5914 obs. of  22 variables:
##  $ bookID              : chr  "4" "12" "13" "14" ...
##  $ title               : chr  "Harry Potter and the Chamber of Secrets (Harry Potter  #2)" "The Ultimate Hitchhiker's Guide: Five Complete Novels and One Story (Hitchhiker's Guide to the Galaxy  #1-5)" "The Ultimate Hitchhiker's Guide to the Galaxy (Hitchhiker's Guide to the Galaxy  #1-5)" "The Hitchhiker's Guide to the Galaxy (Hitchhiker's Guide to the Galaxy  #1)" ...
##  $ author              : chr  "J.K. Rowling" "Douglas Adams" "Douglas Adams" "Douglas Adams" ...
##  $ avg_book_rating     : num  4.42 4.38 4.38 4.22 4.38 4.21 3.44 3.87 4.07 3.9 ...
##  $ isbn                : chr  "0439554896" "0517226952" "0345453743" "1400052920" ...
##  $ language_code       : chr  "eng" "eng" "eng" "eng" ...
##  $ num_pages           : num  352 815 815 215 815 544 55 256 335 304 ...
##  $ book_ratings_count  : int  6333 3628 249558 4930 2877 248558 7270 2088 72451 49240 ...
##  $ text_reviews_count  : num  244 254 4080 460 195 ...
##  $ publication_date    : Date, format: "2003-11-01" "2005-11-01" ...
##  $ publisher           : chr  "Scholastic" "Gramercy Books" "Del Rey Books" "Crown" ...
##  $ title_word_count    : int  18 15 12 44 9 12 8 12 4 14 ...
##  $ title_sentiment_avg : num  0 0.31 0.346 0.362 0.4 ...
##  $ authorworkcount     : int  242 103 103 103 103 59 59 59 59 59 ...
##  $ author_fans         : int  209174 19029 19029 19029 19029 14356 14356 14356 14356 14356 ...
##  $ gender              : Factor w/ 3 levels "female","male",..: 1 2 2 2 2 2 2 2 2 2 ...
##  $ born                : Date, format: NA NA ...
##  $ died                : Date, format: NA NA ...
##  $ avg_author_rating   : num  4.46 4.2 4.2 4.2 4.2 4.03 4.03 4.03 4.03 4.03 ...
##  $ author_ratings_count: int  24511114 2624222 2624222 2624222 2624222 1272002 1272002 1272002 1272002 1272002 ...
##  $ author_review_count : int  579250 57565 57565 57565 57565 76846 76846 76846 76846 76846 ...
##  $ genre               : chr  "fantasy,fiction,young adult" "comedy,fiction,mystery and thrillers" "comedy,fiction,mystery and thrillers" "comedy,fiction,mystery and thrillers" ...

9) SUMMARY STATS

# Render qwraps2 summary tables as markdown so they knit correctly.
options(qwraps2_markup = "markdown")
# FIX: removed view(books_corti) -- view() is an interactive-only helper
# (it opens a data viewer) and serves no purpose in a knitted document.

# min / mean / max / st. dev for each numeric variable of interest.
# qwraps2 expects one-sided formulas, evaluated within the data.
# FIX: normalized the inconsistent "st.dev" / "st. dev" row labels.
our_summary1 <-
  list(
    "Average Book Rating" =
      list("min"     = ~ min(avg_book_rating),
           "mean"    = ~ mean(avg_book_rating),
           "max"     = ~ max(avg_book_rating),
           "st. dev" = ~ sd(avg_book_rating)),
    "Number of Pages" =
      list("min"     = ~ min(num_pages),
           "mean"    = ~ mean(num_pages),
           "max"     = ~ max(num_pages),
           "st. dev" = ~ sd(num_pages)),
    "Book Ratings Count" =
      list("min"     = ~ min(book_ratings_count),
           "mean"    = ~ mean(book_ratings_count),
           "max"     = ~ max(book_ratings_count),
           "st. dev" = ~ sd(book_ratings_count)),
    "Text Reviews Count" =
      list("min"     = ~ min(text_reviews_count),
           "mean"    = ~ mean(text_reviews_count),
           "max"     = ~ max(text_reviews_count),
           "st. dev" = ~ sd(text_reviews_count)),
    "Average Title Sentiment Score" =
      list("min"     = ~ min(title_sentiment_avg),
           "mean"    = ~ mean(title_sentiment_avg),
           "max"     = ~ max(title_sentiment_avg),
           "st. dev" = ~ sd(title_sentiment_avg)),
    "Author's Work Count" =
      list("min"     = ~ min(authorworkcount),
           "mean"    = ~ mean(authorworkcount),
           "max"     = ~ max(authorworkcount),
           "st. dev" = ~ sd(authorworkcount)),
    "Author's Fan Count" =
      list("min"     = ~ min(author_fans),
           "mean"    = ~ mean(author_fans),
           "max"     = ~ max(author_fans),
           "st. dev" = ~ sd(author_fans)),
    "Author Ratings Count" =
      list("min"     = ~ min(author_ratings_count),
           "mean"    = ~ mean(author_ratings_count),
           "max"     = ~ max(author_ratings_count),
           "st. dev" = ~ sd(author_ratings_count)),
    "Author Review Count" =
      list("min"     = ~ min(author_review_count),
           "mean"    = ~ mean(author_review_count),
           "max"     = ~ max(author_review_count),
           "st. dev" = ~ sd(author_review_count))
  )
sum_stats <- summary_table(books_corti, our_summary1) %>% round(1)

print(sum_stats)
## 
## 
## |                                  |books_corti (N = 5,914) |
## |:---------------------------------|:-----------------------|
## |**Average Book Rating**           |&nbsp;&nbsp;            |
## |&nbsp;&nbsp; min                  |1                       |
## |&nbsp;&nbsp; mean                 |3.9                     |
## |&nbsp;&nbsp; max                  |5                       |
## |&nbsp;&nbsp; st. dev              |0.3                     |
## |**Number of Pages**               |&nbsp;&nbsp;            |
## |&nbsp;&nbsp; min                  |10                      |
## |&nbsp;&nbsp; mean                 |350.4                   |
## |&nbsp;&nbsp; max                  |1952                    |
## |&nbsp;&nbsp; st.dev               |195.2                   |
## |**Book Ratings Count**            |&nbsp;&nbsp;            |
## |&nbsp;&nbsp; min                  |0                       |
## |&nbsp;&nbsp; mean                 |21480.8                 |
## |&nbsp;&nbsp; max                  |4597666                 |
## |&nbsp;&nbsp; st. dev              |120423.5                |
## |**Text Reviews Count**            |&nbsp;&nbsp;            |
## |&nbsp;&nbsp; min                  |0                       |
## |&nbsp;&nbsp; mean                 |696.7                   |
## |&nbsp;&nbsp; max                  |94265                   |
## |&nbsp;&nbsp; st. dev              |2844.8                  |
## |**Average Title Sentiment Score** |&nbsp;&nbsp;            |
## |&nbsp;&nbsp; min                  |-1.4                    |
## |&nbsp;&nbsp; mean                 |0                       |
## |&nbsp;&nbsp; max                  |1.3                     |
## |&nbsp;&nbsp; st. dev              |0.3                     |
## |**Author's Work Count**           |&nbsp;&nbsp;            |
## |&nbsp;&nbsp; min                  |1                       |
## |&nbsp;&nbsp; mean                 |231                     |
## |&nbsp;&nbsp; max                  |5204                    |
## |&nbsp;&nbsp; st. dev              |488.1                   |
## |**Author's Fan Count**            |&nbsp;&nbsp;            |
## |&nbsp;&nbsp; min                  |0                       |
## |&nbsp;&nbsp; mean                 |12050                   |
## |&nbsp;&nbsp; max                  |709826                  |
## |&nbsp;&nbsp; st. dev              |55558.6                 |
## |**Author Ratings Count**          |&nbsp;&nbsp;            |
## |&nbsp;&nbsp; min                  |27                      |
## |&nbsp;&nbsp; mean                 |658708                  |
## |&nbsp;&nbsp; max                  |24511114                |
## |&nbsp;&nbsp; st. dev              |1727055.4               |
## |**Author Review Count**           |&nbsp;&nbsp;            |
## |&nbsp;&nbsp; min                  |1                       |
## |&nbsp;&nbsp; mean                 |26511.1                 |
## |&nbsp;&nbsp; max                  |579250                  |
## |&nbsp;&nbsp; st. dev              |58813.9                 |

10) LINEAR MODEL VALIDATION: TRAIN-TEST-SPLIT

Need to load ‘rsample’ library here.

# Reproducible 80/20 train-test split via rsample.
set.seed(1818)

books_split <- initial_split(books_corti, prop = 0.8)

books_train <- training(books_split)
books_test <- testing(books_split)
# sanity-check the split sizes
nrow(books_train)
## [1] 4732
nrow(books_test)
## [1] 1182

11) MODEL 1: LINEAR REGRESSION

Need ‘dplyr’, ‘glmnet’, and ‘glmnetUtils’ libraries here.

# Disable scientific notation so small coefficients print in full.
options(scipen = 999)
# OLS: regress average book rating on the book- and author-level predictors.
mod1 <- lm(
  avg_book_rating ~ num_pages + book_ratings_count + text_reviews_count +
    title_sentiment_avg + authorworkcount + author_fans +
    author_ratings_count + author_review_count + gender,
  data = books_train
)

summary(mod1)
## 
## Call:
## lm(formula = avg_book_rating ~ num_pages + book_ratings_count + 
##     text_reviews_count + title_sentiment_avg + authorworkcount + 
##     author_fans + author_ratings_count + author_review_count + 
##     gender, data = books_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.8325 -0.1575  0.0139  0.1787  1.1391 
## 
## Coefficients:
##                            Estimate     Std. Error t value             Pr(>|t|)
## (Intercept)           3.79675179461  0.01058977825 358.530 < 0.0000000000000002
## num_pages             0.00027651722  0.00002059907  13.424 < 0.0000000000000002
## book_ratings_count   -0.00000005966  0.00000007126  -0.837             0.402464
## text_reviews_count    0.00000817665  0.00000320658   2.550             0.010805
## title_sentiment_avg   0.04097254653  0.01483542542   2.762             0.005771
## authorworkcount       0.00003276625  0.00000871417   3.760             0.000172
## author_fans           0.00000013706  0.00000013538   1.012             0.311392
## author_ratings_count  0.00000006167  0.00000000699   8.823 < 0.0000000000000002
## author_review_count  -0.00000172266  0.00000025130  -6.855     0.00000000000804
## gendermale            0.03361376024  0.00913831936   3.678             0.000237
## genderunknown         0.01812663016  0.01336993658   1.356             0.175236
##                         
## (Intercept)          ***
## num_pages            ***
## book_ratings_count      
## text_reviews_count   *  
## title_sentiment_avg  ** 
## authorworkcount      ***
## author_fans             
## author_ratings_count ***
## author_review_count  ***
## gendermale           ***
## genderunknown           
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.275 on 4721 degrees of freedom
## Multiple R-squared:  0.06875,    Adjusted R-squared:  0.06678 
## F-statistic: 34.85 on 10 and 4721 DF,  p-value: < 0.00000000000000022

#——————————————————– # estimating “prettier” regression output #——————————————————–

Need ‘sjPlot’ and ‘tidymodels’ libraries.

#——————————————————– tab_model() outputs a table of results #——————————————————–

tab_model(mod1, digits = 3)
  avg book rating
Predictors Estimates CI p
(Intercept) 3.797 3.776 – 3.818 <0.001
num_pages 0.000 0.000 – 0.000 <0.001
book_ratings_count -0.000 -0.000 – 0.000 0.402
text_reviews_count 0.000 0.000 – 0.000 0.011
title_sentiment_avg 0.041 0.012 – 0.070 0.006
authorworkcount 0.000 0.000 – 0.000 <0.001
author_fans 0.000 -0.000 – 0.000 0.311
author_ratings_count 0.000 0.000 – 0.000 <0.001
author_review_count -0.000 -0.000 – -0.000 <0.001
gender [male] 0.034 0.016 – 0.052 <0.001
gender [unknown] 0.018 -0.008 – 0.044 0.175
Observations 4732
R2 / R2 adjusted 0.069 / 0.067

#——————————————————– plot_model() outputs a plot of regression coefficients #——————————————————–

plot_model(mod1)+ ylim(-0.1,0.1)  + ggtitle("            Average Book Rating Coefficients") + theme_minimal(base_size = 16)

#——————————————————– tidy() outputs a table of coefficients and their p-values, t-stats #——————————————————–

tidy(mod1)
## # A tibble: 11 x 5
##    term                      estimate     std.error statistic  p.value
##    <chr>                        <dbl>         <dbl>     <dbl>    <dbl>
##  1 (Intercept)           3.80         0.0106          359.    0.      
##  2 num_pages             0.000277     0.0000206        13.4   2.39e-40
##  3 book_ratings_count   -0.0000000597 0.0000000713     -0.837 4.02e- 1
##  4 text_reviews_count    0.00000818   0.00000321        2.55  1.08e- 2
##  5 title_sentiment_avg   0.0410       0.0148            2.76  5.77e- 3
##  6 authorworkcount       0.0000328    0.00000871        3.76  1.72e- 4
##  7 author_fans           0.000000137  0.000000135       1.01  3.11e- 1
##  8 author_ratings_count  0.0000000617 0.00000000699     8.82  1.55e-18
##  9 author_review_count  -0.00000172   0.000000251      -6.86  8.04e-12
## 10 gendermale            0.0336       0.00914           3.68  2.37e- 4
## 11 genderunknown         0.0181       0.0134            1.36  1.75e- 1

12) MODEL 2: ELASTIC NET

Note: We used an alpha sequence from 0 to 1 in steps of 0.1.

# Cross-validate an elastic net across a grid of alpha mixing values
# (0 = ridge, 1 = lasso) in steps of 0.1; lambda is tuned by CV per alpha.
enet_mod <- cva.glmnet(
  avg_book_rating ~ num_pages + book_ratings_count + text_reviews_count +
    title_sentiment_avg + authorworkcount + author_fans +
    author_ratings_count + author_review_count + gender,
  data = books_train,
  alpha = seq(0, 1, by = 0.1)
)
print(enet_mod)
## Call:
## cva.glmnet.formula(formula = avg_book_rating ~ num_pages + book_ratings_count + 
##     text_reviews_count + title_sentiment_avg + authorworkcount + 
##     author_fans + author_ratings_count + author_review_count + 
##     gender, data = books_train, alpha = seq(0, 1, by = 0.1))
## 
## Model fitting options:
##     Sparse model matrix: FALSE
##     Use model.frame: FALSE
##     Alpha values: 0 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1
##     Number of crossvalidation folds for lambda: 10
# CV error curves, one panel per alpha value
plot(enet_mod)

# minimum CV error achieved at each alpha
minlossplot(enet_mod, 
            cv.type = "min")

13) EXTRACT BEST ELASTIC NET MODEL

# Use this function to find the best alpha.
# Scans every cross-validated model in fit$modlist (one per alpha value),
# takes each model's minimum CV error, and returns the alpha whose model
# achieved the smallest error overall.
get_alpha <- function(fit) {
  alpha <- fit$alpha
  # FIX: vapply() instead of sapply() -- guarantees a numeric vector
  # regardless of input shape.
  error <- vapply(fit$modlist,
                  function(mod) {min(mod$cvm)},
                  numeric(1))
  alpha[which.min(error)]
}

# Get all parameters.
# Returns a one-row data.frame with the best alpha, its lambda.min and
# lambda.1se, and the associated minimum cross-validation error.
get_model_params <- function(fit) {
  alpha <- fit$alpha
  # FIX: vapply() with an explicit numeric(1) template instead of sapply()
  # -- same values, type-stable.
  lambdaMin <- vapply(fit$modlist, `[[`, numeric(1), "lambda.min")
  lambdaSE <- vapply(fit$modlist, `[[`, numeric(1), "lambda.1se")
  error <- vapply(fit$modlist, function(mod) {min(mod$cvm)}, numeric(1))
  best <- which.min(error)
  data.frame(alpha = alpha[best], lambdaMin = lambdaMin[best],
             lambdaSE = lambdaSE[best], error = error[best])
}

# Extract the best alpha value & model parameters.
best_alpha <- get_alpha(enet_mod)
print(best_alpha)
## [1] 1
get_model_params(enet_mod)
##   alpha    lambdaMin   lambdaSE      error
## 1     1 0.0001936006 0.01273763 0.07581943
# Extract the best model object: the cv.glmnet fit at the winning alpha.
best_mod <- enet_mod$modlist[[which(enet_mod$alpha == best_alpha)]]

14) MODEL 3: BEST ELASTIC NET MODEL

# Refit a single cross-validated elastic net at the best mixing value.
# FIX: alpha was hard-coded to 0.1, but get_alpha()/minlossplot identified
# alpha = 1 (the lasso) as best -- which is also why the coefficient
# columns below are labelled "lasso".  Use the extracted best_alpha
# instead of a hard-coded constant.
enet_best_mod <- cv.glmnet(avg_book_rating ~ num_pages + book_ratings_count + text_reviews_count + title_sentiment_avg + authorworkcount + author_fans + author_ratings_count + author_review_count + gender,
                       data = books_train,
                       alpha = best_alpha)
summary(enet_best_mod)
##                 Length Class  Mode     
## lambda          92     -none- numeric  
## cvm             92     -none- numeric  
## cvsd            92     -none- numeric  
## cvup            92     -none- numeric  
## cvlo            92     -none- numeric  
## nzero           92     -none- numeric  
## call             4     -none- call     
## name             1     -none- character
## glmnet.fit      12     elnet  list     
## lambda.min       1     -none- numeric  
## lambda.1se       1     -none- numeric  
## terms            2     -none- call     
## xlev             9     -none- list     
## alpha            1     -none- numeric  
## nfolds           1     -none- numeric  
## sparse           1     -none- logical  
## use.model.frame  1     -none- logical  
## na.action        1     -none- character
print(enet_best_mod)
## Call:
## cv.glmnet.formula(formula = avg_book_rating ~ num_pages + book_ratings_count + 
##     text_reviews_count + title_sentiment_avg + authorworkcount + 
##     author_fans + author_ratings_count + author_review_count + 
##     gender, data = books_train, alpha = 0.1)
## 
## Model fitting options:
##     Sparse model matrix: FALSE
##     Use model.frame: FALSE
##     Number of crossvalidation folds: 10
##     Alpha: 0.1
##     Deviance-minimizing lambda: 0.0001430848  (+1 SE): 0.3229453

Print the model’s two suggested values of lambda.

# lambda that minimizes CV error
print(enet_best_mod$lambda.min)
## [1] 0.0001430848
# largest lambda within 1 SE of the minimum (sparser model)
print(enet_best_mod$lambda.1se)
## [1] 0.3229453

Plot how the MSE varies as we vary lambda.

# CV error as a function of log(lambda)
plot(enet_best_mod)

# coefficient shrinkage paths across lambda (coefplot package)
coefpath(enet_best_mod)

Compare lambda min & lambda 1SE…

# put into coefficient vector
# Coefficients at lambda.min vs lambda.1se, side by side.  coef() returns
# a sparse matrix, hence the as.matrix() %>% data.frame() conversion; the
# positional rename() at the end restores the intended column names,
# which data.frame() would otherwise mangle.
enet_coefs <- data.frame(
  `lasso_min` = coef(enet_best_mod, s = enet_best_mod$lambda.min) %>%
    as.matrix() %>% data.frame() %>% round(3),
  `lasso_1se` = coef(enet_best_mod, s = enet_best_mod$lambda.1se) %>% 
    as.matrix() %>% data.frame() %>% round(3)
) %>%  rename(`lasso_min` = 1, `lasso_1se` = 2)

print(enet_coefs)
##                      lasso_min lasso_1se
## (Intercept)              3.815     3.903
## num_pages                0.000     0.000
## book_ratings_count       0.000     0.000
## text_reviews_count       0.000     0.000
## title_sentiment_avg      0.041     0.000
## authorworkcount          0.000     0.000
## author_fans              0.000     0.000
## author_ratings_count     0.000     0.000
## author_review_count      0.000     0.000
## genderfemale            -0.018     0.000
## gendermale               0.015     0.000
## genderunknown            0.000     0.000
# the same table, styled for the knitted document
enet_coefs %>% kable() %>% kable_styling()
lasso_min lasso_1se
(Intercept) 3.815 3.903
num_pages 0.000 0.000
book_ratings_count 0.000 0.000
text_reviews_count 0.000 0.000
title_sentiment_avg 0.041 0.000
authorworkcount 0.000 0.000
author_fans 0.000 0.000
author_ratings_count 0.000 0.000
author_review_count 0.000 0.000
genderfemale -0.018 0.000
gendermale 0.015 0.000
genderunknown 0.000 0.000

15) MODEL 4: BOOTSTRAP AGGREGATING (BAGGING)

Need ‘partykit’, ‘PerformanceAnalytics’, ‘rpart’, ‘rpart.plot’, and ‘randomForest’ libraries.

options(scipen = 10)
# FIX: uncommented the seed so the bagging results are reproducible.
set.seed(1818)
# store row names as columns (numeric, to join predictions back on)
books_boot_preds <- books_corti %>% rownames_to_column() %>% 
  mutate(rowname = as.numeric(rowname))

B <- 100      # number of bootstrap samples
num_b <- 500  # sample size of each bootstrap
boot_mods <- list() # store our bagging models
for(i in seq_len(B)){
  # FIX: bootstrap samples must be drawn WITH replacement; the original
  # used replace = FALSE, which is subsampling, not bootstrapping.
  boot_idx <- sample(seq_len(nrow(books_corti)), 
                     size = num_b,
                     replace = TRUE)
  # fit a tree on each bootstrap sample
  boot_tree <- ctree(avg_book_rating ~ num_pages + book_ratings_count + text_reviews_count + title_sentiment_avg + authorworkcount + author_fans + author_ratings_count + author_review_count+ gender, 
                     data = books_corti %>% 
                       slice(boot_idx)) 
  # store bootstrapped model
  boot_mods[[i]] <- boot_tree
  # generate predictions for that bootstrap model
  preds_boot <- data.frame(
    preds_boot = predict(boot_tree),
    rowname = boot_idx 
  ) %>%
    # rows drawn more than once get identical predictions, so keep one
    # copy per row -- otherwise the left_join below would duplicate rows
    # of books_boot_preds on every iteration
    distinct(rowname, .keep_all = TRUE)
  # rename prediction to indicate which boot iteration it came from
  names(preds_boot)[1] <- paste("preds_boot",i,sep = "")
  # merge predictions to dataset
  books_boot_preds <- left_join(x = books_boot_preds, y = preds_boot,
                                  by = "rowname")
}

plot() examines an individual model from the bagging ensemble.

# Inspect a single tree from the bagging ensemble
plot(boot_mods[[1]], gp = gpar(fontsize = 8))

# Bagged prediction: per-row mean across the 100 bootstrap trees, skipping
# iterations in which the row was not sampled (those cells are NA).
boot_pred_cols <- paste0("preds_boot", seq_len(100))
books_boot_preds$preds_bag <-
  rowMeans(books_boot_preds[, boot_pred_cols], na.rm = TRUE)

# NOTE: At this point in the code, the model has been bootstrapped.

16) MODEL 4: RANDOM FOREST

# Random forest regression on the same predictor set.
# FIXES: (1) dropped `type = regression` -- randomForest() has no such
# argument, and `regression` is an undefined symbol that only avoided an
# error through lazy evaluation; the task is inferred from the numeric
# response. (2) mtry is a count of predictors, so the fractional 11/3 is
# replaced by the value actually used (the printed fit reports 4).
rf_fit <- randomForest(avg_book_rating ~ num_pages + book_ratings_count + text_reviews_count + title_sentiment_avg + authorworkcount + author_fans + author_ratings_count + author_review_count + gender,
                       data = books_corti,
                       mtry = 4,           # variables tried at each split
                       ntree = 200,        # number of trees grown
                       importance = TRUE)  # keep permutation importances

# Fit summary: OOB mean squared residuals and % variance explained
print(rf_fit)
## 
## Call:
##  randomForest(formula = avg_book_rating ~ num_pages + book_ratings_count +      text_reviews_count + title_sentiment_avg + authorworkcount +      author_fans + author_ratings_count + author_review_count +      gender, data = books_corti, type = regression, mtry = 11/3,      ntree = 200, importance = TRUE) 
##                Type of random forest: regression
##                      Number of trees: 200
## No. of variables tried at each split: 4
## 
##           Mean of squared residuals: 0.06013366
##                     % Var explained: 24.51
# OOB error as a function of the number of trees grown
plot(rf_fit)

# Permutation importance (type = 1: mean decrease in accuracy)
varImpPlot(rf_fit, type = 1)

# NOTE(review): plot_min_depth_distribution() and plot_predict_interaction()
# are from the randomForestExplainer package, which is not loaded in the
# setup chunk -- confirm it is attached before knitting.
plot_min_depth_distribution(rf_fit)

plot_predict_interaction(rf_fit, books_corti, "author_ratings_count", "num_pages")

plot_predict_interaction(rf_fit, books_corti, "authorworkcount", "num_pages")

plot_predict_interaction(rf_fit, books_corti, "num_pages", "title_sentiment_avg")

Storing predictions data frames for Linear and ElasticNet models…

# Training- and test-set predictions from the linear model (mod1) and the
# cross-validated elastic net, evaluated at the deviance-minimizing lambda.
lm_preds_train   <- predict(mod1, newdata = books_train)
lm_preds_test    <- predict(mod1, newdata = books_test)
enet_preds_train <- predict(enet_best_mod, newdata = books_train, s = "lambda.min")
enet_preds_test  <- predict(enet_best_mod, newdata = books_test, s = "lambda.min")
# Sanity check the prediction objects. Note the shapes: lm predict()
# returns a named numeric vector, while glmnet predict() returns a
# one-column matrix -- the downstream code must flatten the latter.
head(lm_preds_train)
##        1        2        3        4        5        6 
## 4.446006 4.138931 4.157041 3.976757 4.142188 3.992752
head(lm_preds_test)
##       10       20       21       38       39       43 
## 3.874056 3.856204 3.880741 3.859920 3.838027 3.983870
head(enet_preds_train)
##          1
## 1 4.439726
## 2 4.137918
## 3 4.156343
## 4 3.975785
## 5 4.141186
## 6 3.993360
head(enet_preds_test)
##           1
## 10 3.875319
## 20 3.856850
## 21 3.880675
## 38 3.859698
## 39 3.837804
## 43 3.983638

Storing results data frames for Linear and ElasticNet models…

# Column-bind predictions onto the train/test frames for plotting.
# NOTE(review): the glmnet prediction object is a one-column matrix whose
# auto-generated column name data.frame() mangles (here to "X1"); the
# rename() calls below depend on that mangling -- fragile, verify after
# any change to how the predictions are produced.
training_predictions <- data.frame(lm_preds_train, enet_preds_train)

results_train <- data.frame(books_train, training_predictions) %>% rename(enet_training = X1)
testing_predictions <- data.frame(
                       "lm_testing" = lm_preds_test,
                        "enet_testing" = enet_preds_test)

results_test <- data.frame(books_test, testing_predictions) %>% rename(enet_testing = X1)

17) GGPLOT OF LINEAR REGRESSION: TRAINING RESULTS

# Training fit of the linear model: predicted vs. observed ratings.
# The identity line marks perfect prediction.
ggplot(results_train, aes(x = avg_book_rating, y = lm_preds_train)) +
  geom_point(alpha = 1/10, size = 4) +
  geom_abline(color = "turquoise") +
  theme_minimal(base_size = 16) +
  labs(
    title = "              Linear Regression: Training True vs Predicted",
    x = "True Average Ratings",
    y = "Predicted Average Ratings"
  ) +
  xlim(0, 5) +
  ylim(0, 5)

18) GGPLOT OF ELASTIC NET: TRAINING RESULTS

# Training fit of the elastic net: predicted vs. observed ratings.
# The identity line marks perfect prediction.
ggplot(results_train, aes(x = avg_book_rating, y = enet_preds_train)) +
  geom_point(alpha = 1/10, size = 4) +
  geom_abline(color = "turquoise") +
  theme_minimal(base_size = 16) +
  labs(
    title = "        Best ElasticNet: Training True vs Predicted",
    x = "True Average Ratings",
    y = "Predicted Average Ratings"
  ) +
  xlim(0, 5) +
  ylim(0, 5)

19) GGPLOT OF LINEAR REGRESSION: TESTING RESULTS

# Test-set fit of the linear model: predicted vs. observed ratings.
# The identity line marks perfect prediction.
ggplot(results_test, aes(x = avg_book_rating, y = lm_preds_test)) +
  geom_point(alpha = 1/10, size = 4) +
  geom_abline(color = "coral") +
  theme_minimal(base_size = 16) +
  labs(
    title = "              Linear Regression: Testing True vs Predicted",
    x = "True Average Ratings",
    y = "Predicted Average Ratings"
  ) +
  xlim(0, 5) +
  ylim(0, 5)

20) GGPLOT OF ELASTIC NET: TESTING RESULTS

# Test-set fit of the elastic net: predicted vs. observed ratings.
# The identity line marks perfect prediction.
ggplot(results_test, aes(x = avg_book_rating, y = enet_preds_test)) +
  geom_point(alpha = 1/10, size = 4) +
  geom_abline(color = "coral") +
  theme_minimal(base_size = 16) +
  labs(
    title = "         Best ElasticNet: Testing True vs Predicted",
    x = "True Average Ratings",
    y = "Predicted Average Ratings"
  ) +
  xlim(0, 5) +
  ylim(0, 5)

21) MODEL EVALUATION

LINEAR REGRESSION TRAINING METRICS

# In-sample fit of the linear model (yardstick metrics; each call returns
# a one-row tibble with .metric / .estimator / .estimate columns).
rmse(books_train, truth = avg_book_rating, estimate = lm_preds_train)
## # A tibble: 1 x 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard       0.275
mae(books_train, truth = avg_book_rating, estimate = lm_preds_train)
## # A tibble: 1 x 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 mae     standard       0.209
rsq(books_train, truth = avg_book_rating, estimate = lm_preds_train)
## # A tibble: 1 x 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rsq     standard      0.0687

LINEAR REGRESSION TEST METRICS

# Hold-out performance of the linear model. Each metric is kept as a
# one-row tibble (.metric / .estimator / .estimate) for merging later.
lm_rmse <- rmse(data = books_test, truth = avg_book_rating, estimate = lm_preds_test)
lm_mae  <- mae(data = books_test, truth = avg_book_rating, estimate = lm_preds_test)
lm_rsq  <- rsq(data = books_test, truth = avg_book_rating, estimate = lm_preds_test)

ELASTIC NET TRAINING METRICS

# In-sample fit of the elastic net. glmnet predictions arrive as a
# one-column matrix, so as.vector() flattens them for yardstick.
rmse(books_train, truth = avg_book_rating, estimate = as.vector(enet_preds_train))
## # A tibble: 1 x 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard       0.275
mae(books_train, truth = avg_book_rating, estimate = as.vector(enet_preds_train))
## # A tibble: 1 x 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 mae     standard       0.209
rsq(books_train, truth = avg_book_rating, estimate = as.vector(enet_preds_train))
## # A tibble: 1 x 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rsq     standard      0.0687

ELASTIC NET TEST METRICS

# Hold-out performance of the elastic net. glmnet predictions arrive as a
# one-column matrix, so as.vector() flattens them for yardstick.
enet_rmse <- rmse(data = books_test, truth = avg_book_rating, estimate = as.vector(enet_preds_test))
enet_mae  <- mae(data = books_test, truth = avg_book_rating, estimate = as.vector(enet_preds_test))
enet_rsq  <- rsq(data = books_test, truth = avg_book_rating, estimate = as.vector(enet_preds_test))

Bagged tree predictions (note: these are averages of in-bag predictions, not true out-of-bag estimates)…

# Attach the bagged predictions back onto the modeling frame.
# NOTE(review): right_join() without an explicit `by` matches on every
# shared column (see message below), including floating-point ones --
# fragile; consider joining on a single stable key instead.
books_right_join <- right_join(books_corti, books_boot_preds)
## Joining, by = c("bookID", "title", "author", "avg_book_rating", "isbn", "language_code", "num_pages", "book_ratings_count", "text_reviews_count", "publication_date", "publisher", "title_word_count", "title_sentiment_avg", "authorworkcount", "author_fans", "gender", "born", "died", "avg_author_rating", "author_ratings_count", "author_review_count", "genre")
books_right_join <- books_right_join %>% ungroup()

# Bagging-ensemble metrics against the observed ratings.
# NOTE(review): preds_bag averages in-bag predictions, so despite the
# heading these are not true out-of-bag estimates.
tree_rmse <- rmse(books_right_join, truth = avg_book_rating, estimate = preds_bag)
tree_mae <- mae(books_right_join, truth = avg_book_rating, estimate = preds_bag)
tree_rsq <- rsq(books_right_join, truth = avg_book_rating, estimate = preds_bag)

Random Forest OUT-OF-BAG Predictions…

# predict() on a randomForest fit with no newdata returns the out-of-bag
# predictions, so these metrics are honest OOB performance estimates.
preds_OOB <- predict(rf_fit)

rf_rmse <- rmse(books_corti, truth = avg_book_rating, estimate = preds_OOB)
rf_mae  <- mae(books_corti, truth = avg_book_rating, estimate = preds_OOB)
rf_rsq  <- rsq(books_corti, truth = avg_book_rating, estimate = preds_OOB)

22) MERGING – RSQ, RMSE & MAE COMBINED DATA FRAME

All testing data predictions…

# Combine the per-model one-row metric tibbles into comparison rows.
# NOTE(review): merge() disambiguates duplicate columns with .x/.y
# suffixes, and the rename() calls below depend on the exact merge order
# (rf first -> .estimate.x, enet -> .estimate.y, lm -> .estimate) --
# fragile; reordering any merge silently mislabels the models.

# R-squared row: Random Forest + ElasticNet, then Linear, then Tree
rsq_DF <- merge(rf_rsq, enet_rsq, by=c(".metric", ".estimator")) 
                           
rsq_DF1 <- merge(rsq_DF, lm_rsq, by=c(".metric", ".estimator")) %>% rename("Random Forest" = .estimate.x, "ElasticNet" = .estimate.y, "Linear" = .estimate) 

rsq_DF2 <- merge(rsq_DF1, tree_rsq, by=c(".metric", ".estimator")) %>% select(-.estimator)

print(rsq_DF2)
##   .metric Random Forest ElasticNet     Linear .estimate
## 1     rsq       0.24535 0.06579101 0.06588606 0.1084553
# RMSE row, built the same way
rmse_DF <- merge(rf_rmse, enet_rmse, by=c(".metric", ".estimator"))

rmse_DF1 <- merge(rmse_DF, lm_rmse, by=c(".metric", ".estimator")) %>% rename("Random Forest" = .estimate.x, "ElasticNet" = .estimate.y, "Linear" = .estimate) 

rmse_DF2 <- merge(rmse_DF1, tree_rmse, by=c(".metric", ".estimator")) %>% select(-.estimator)

print(rmse_DF2)
##   .metric Random Forest ElasticNet    Linear .estimate
## 1    rmse     0.2452216  0.2633069 0.2633021 0.2692242
# MAE row, built the same way
mae_DF <- merge(rf_mae, enet_mae, by=c(".metric", ".estimator"))

mae_DF1 <- merge(mae_DF, lm_mae, by=c(".metric", ".estimator")) %>% rename("Random Forest" = .estimate.x, "ElasticNet" = .estimate.y, "Linear" = .estimate) 

mae_DF2 <- merge(mae_DF1, tree_mae, by=c(".metric", ".estimator")) %>% select(-.estimator)

print(mae_DF2)
##   .metric Random Forest ElasticNet    Linear .estimate
## 1     mae     0.1795154  0.2016244 0.2016341 0.2051364
# Stack the three metric rows into the final comparison table
total <- rbind(rsq_DF2, rmse_DF2)

final <-rbind(total, mae_DF2) %>% rename("Tree" = .estimate, "Metrics" = .metric) 

print(final)
##   Metrics Random Forest ElasticNet     Linear      Tree
## 1     rsq     0.2453500 0.06579101 0.06588606 0.1084553
## 2    rmse     0.2452216 0.26330694 0.26330206 0.2692242
## 3     mae     0.1795154 0.20162440 0.20163410 0.2051364

23) METRICS DATA TABLE

Credit for the code below: https://rfortherestofus.com/2019/11/how-to-make-beautiful-tables-in-r/

Need to load ‘kableExtra’ library.

# Styled HTML rendering of the final model-comparison table (kableExtra)
final %>% kable() %>% kable_styling()
Metrics Random Forest ElasticNet Linear Tree
rsq 0.2453500 0.0657910 0.0658861 0.1084553
rmse 0.2452216 0.2633069 0.2633021 0.2692242
mae 0.1795154 0.2016244 0.2016341 0.2051364

Credit for code below: https://www.littlemissdata.com/blog/prettytables

Need to load ‘formattable’, ‘tidyr’, and ‘data.table’ libraries.

# One fill color per model column for the formattable rendering below
custom_one = "#CCCCFF"
custom_two = "skyblue"
custom_three = "#4ec5a5"
custom_coral = "#FA7268"
# custom_green = "#00AD43"

# Color-coded comparison table: each model column gets a flat color tile
# (color_tile with identical endpoints = solid fill, no gradient); the
# Metrics column is styled as bold grey text.
formattable(final, 
            align =c("l","c","c","c","c", "c", "c", "c", "r"), 
            list(`Metrics` = formatter(
              "span", style = ~ style(color = "grey",font.weight = "bold")), 
                `Random Forest`= color_tile(custom_one, custom_one), 
              `ElasticNet`= color_tile(custom_two, custom_two), 
              `Linear`= color_tile(custom_three, custom_three), 
              `Tree`= color_tile(custom_coral, custom_coral)
))
Metrics Random Forest ElasticNet Linear Tree
rsq 0.2453500 0.06579101 0.06588606 0.1084553
rmse 0.2452216 0.26330694 0.26330206 0.2692242
mae 0.1795154 0.20162440 0.20163410 0.2051364